# Import every package / module needed for this analysis
import pandas as pd
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
# Load the crash records from the CSV export and preview the first rows.
csv_path = 'Airplane_Crashes_and_Fatalities_Since_1908.csv'
df = pd.read_csv(csv_path)
df.head()
| Date | Time | Location | Operator | Flight # | Route | Type | Registration | cn/In | Aboard | Fatalities | Ground | Summary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 09/17/1908 | 17:18 | Fort Myer, Virginia | Military - U.S. Army | NaN | Demonstration | Wright Flyer III | NaN | 1 | 2.0 | 1.0 | 0.0 | During a demonstration flight, a U.S. Army fly... |
| 1 | 07/12/1912 | 06:30 | AtlantiCity, New Jersey | Military - U.S. Navy | NaN | Test flight | Dirigible | NaN | NaN | 5.0 | 5.0 | 0.0 | First U.S. dirigible Akron exploded just offsh... |
| 2 | 08/06/1913 | NaN | Victoria, British Columbia, Canada | Private | - | NaN | Curtiss seaplane | NaN | NaN | 1.0 | 1.0 | 0.0 | The first fatal airplane accident in Canada oc... |
| 3 | 09/09/1913 | 18:30 | Over the North Sea | Military - German Navy | NaN | NaN | Zeppelin L-1 (airship) | NaN | NaN | 20.0 | 14.0 | 0.0 | The airship flew into a thunderstorm and encou... |
| 4 | 10/17/1913 | 10:30 | Near Johannisthal, Germany | Military - German Navy | NaN | NaN | Zeppelin L-2 (airship) | NaN | NaN | 30.0 | 30.0 | 0.0 | Hydrogen gas which was being vented was sucked... |
# Inspect every column of the dataset (to decide which parts need cleaning):
# info() reports the non-null count and dtype of each column.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5268 entries, 0 to 5267 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 5268 non-null object 1 Time 3049 non-null object 2 Location 5248 non-null object 3 Operator 5250 non-null object 4 Flight # 1069 non-null object 5 Route 3561 non-null object 6 Type 5241 non-null object 7 Registration 4933 non-null object 8 cn/In 4040 non-null object 9 Aboard 5246 non-null float64 10 Fatalities 5256 non-null float64 11 Ground 5246 non-null float64 12 Summary 4878 non-null object dtypes: float64(3), object(10) memory usage: 535.2+ KB
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()
| Aboard | Fatalities | Ground | |
|---|---|---|---|
| count | 5246.000000 | 5256.000000 | 5246.000000 |
| mean | 27.554518 | 20.068303 | 1.608845 |
| std | 43.076711 | 33.199952 | 53.987827 |
| min | 0.000000 | 0.000000 | 0.000000 |
| 25% | 5.000000 | 3.000000 | 0.000000 |
| 50% | 13.000000 | 9.000000 | 0.000000 |
| 75% | 30.000000 | 23.000000 | 0.000000 |
| max | 644.000000 | 583.000000 | 2750.000000 |
# 1. Change the data type of the Date column
# The dates are stored as plain strings; it is better to convert them to datetime.
first_date = df['Date'].iloc[0]
print(type(first_date))
<class 'str'>
# The raw values are month/day/year strings (e.g. 09/17/1908). Stating the
# format explicitly avoids relying on format inference and makes any
# malformed date fail loudly instead of being guessed.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
df.head()
| Date | Time | Location | Operator | Flight # | Route | Type | Registration | cn/In | Aboard | Fatalities | Ground | Summary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1908-09-17 | 17:18 | Fort Myer, Virginia | Military - U.S. Army | NaN | Demonstration | Wright Flyer III | NaN | 1 | 2.0 | 1.0 | 0.0 | During a demonstration flight, a U.S. Army fly... |
| 1 | 1912-07-12 | 06:30 | AtlantiCity, New Jersey | Military - U.S. Navy | NaN | Test flight | Dirigible | NaN | NaN | 5.0 | 5.0 | 0.0 | First U.S. dirigible Akron exploded just offsh... |
| 2 | 1913-08-06 | NaN | Victoria, British Columbia, Canada | Private | - | NaN | Curtiss seaplane | NaN | NaN | 1.0 | 1.0 | 0.0 | The first fatal airplane accident in Canada oc... |
| 3 | 1913-09-09 | 18:30 | Over the North Sea | Military - German Navy | NaN | NaN | Zeppelin L-1 (airship) | NaN | NaN | 20.0 | 14.0 | 0.0 | The airship flew into a thunderstorm and encou... |
| 4 | 1913-10-17 | 10:30 | Near Johannisthal, Germany | Military - German Navy | NaN | NaN | Zeppelin L-2 (airship) | NaN | NaN | 30.0 | 30.0 | 0.0 | Hydrogen gas which was being vented was sucked... |
# 2. Extract only the year from the date
# The .dt accessor reads the year component straight from the datetime column.
df['Year'] = df['Date'].dt.year
df.head()
| Date | Time | Location | Operator | Flight # | Route | Type | Registration | cn/In | Aboard | Fatalities | Ground | Summary | Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1908-09-17 | 17:18 | Fort Myer, Virginia | Military - U.S. Army | NaN | Demonstration | Wright Flyer III | NaN | 1 | 2.0 | 1.0 | 0.0 | During a demonstration flight, a U.S. Army fly... | 1908 |
| 1 | 1912-07-12 | 06:30 | AtlantiCity, New Jersey | Military - U.S. Navy | NaN | Test flight | Dirigible | NaN | NaN | 5.0 | 5.0 | 0.0 | First U.S. dirigible Akron exploded just offsh... | 1912 |
| 2 | 1913-08-06 | NaN | Victoria, British Columbia, Canada | Private | - | NaN | Curtiss seaplane | NaN | NaN | 1.0 | 1.0 | 0.0 | The first fatal airplane accident in Canada oc... | 1913 |
| 3 | 1913-09-09 | 18:30 | Over the North Sea | Military - German Navy | NaN | NaN | Zeppelin L-1 (airship) | NaN | NaN | 20.0 | 14.0 | 0.0 | The airship flew into a thunderstorm and encou... | 1913 |
| 4 | 1913-10-17 | 10:30 | Near Johannisthal, Germany | Military - German Navy | NaN | NaN | Zeppelin L-2 (airship) | NaN | NaN | 30.0 | 30.0 | 0.0 | Hydrogen gas which was being vented was sucked... | 1913 |
# 3. Drop the rows whose Aboard value is NaN
rows_without_aboard = df.index[df['Aboard'].isna()]
df.drop(index=rows_without_aboard, inplace=True)
df.head()
| Date | Time | Location | Operator | Flight # | Route | Type | Registration | cn/In | Aboard | Fatalities | Ground | Summary | Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1908-09-17 | 17:18 | Fort Myer, Virginia | Military - U.S. Army | NaN | Demonstration | Wright Flyer III | NaN | 1 | 2.0 | 1.0 | 0.0 | During a demonstration flight, a U.S. Army fly... | 1908 |
| 1 | 1912-07-12 | 06:30 | AtlantiCity, New Jersey | Military - U.S. Navy | NaN | Test flight | Dirigible | NaN | NaN | 5.0 | 5.0 | 0.0 | First U.S. dirigible Akron exploded just offsh... | 1912 |
| 2 | 1913-08-06 | NaN | Victoria, British Columbia, Canada | Private | - | NaN | Curtiss seaplane | NaN | NaN | 1.0 | 1.0 | 0.0 | The first fatal airplane accident in Canada oc... | 1913 |
| 3 | 1913-09-09 | 18:30 | Over the North Sea | Military - German Navy | NaN | NaN | Zeppelin L-1 (airship) | NaN | NaN | 20.0 | 14.0 | 0.0 | The airship flew into a thunderstorm and encou... | 1913 |
| 4 | 1913-10-17 | 10:30 | Near Johannisthal, Germany | Military - German Navy | NaN | NaN | Zeppelin L-2 (airship) | NaN | NaN | 30.0 | 30.0 | 0.0 | Hydrogen gas which was being vented was sucked... | 1913 |
# 4. Impute the Ground column: a missing ground-casualty count is treated as 0
df.fillna({'Ground': 0}, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 5246 entries, 0 to 5267 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 5246 non-null datetime64[ns] 1 Time 3049 non-null object 2 Location 5227 non-null object 3 Operator 5228 non-null object 4 Flight # 1067 non-null object 5 Route 3556 non-null object 6 Type 5221 non-null object 7 Registration 4914 non-null object 8 cn/In 4029 non-null object 9 Aboard 5246 non-null float64 10 Fatalities 5246 non-null float64 11 Ground 5246 non-null float64 12 Summary 4863 non-null object 13 Year 5246 non-null int32 dtypes: datetime64[ns](1), float64(3), int32(1), object(9) memory usage: 594.3+ KB
# 5. Drop the remaining NaN data
# a. By column: remove these columns from the frame in a single call.
unused_columns = ['Time', 'Flight #', 'Route', 'Registration', 'cn/In']
df.drop(columns=unused_columns, inplace=True)
df
| Date | Location | Operator | Type | Aboard | Fatalities | Ground | Summary | Year | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1908-09-17 | Fort Myer, Virginia | Military - U.S. Army | Wright Flyer III | 2.0 | 1.0 | 0.0 | During a demonstration flight, a U.S. Army fly... | 1908 |
| 1 | 1912-07-12 | AtlantiCity, New Jersey | Military - U.S. Navy | Dirigible | 5.0 | 5.0 | 0.0 | First U.S. dirigible Akron exploded just offsh... | 1912 |
| 2 | 1913-08-06 | Victoria, British Columbia, Canada | Private | Curtiss seaplane | 1.0 | 1.0 | 0.0 | The first fatal airplane accident in Canada oc... | 1913 |
| 3 | 1913-09-09 | Over the North Sea | Military - German Navy | Zeppelin L-1 (airship) | 20.0 | 14.0 | 0.0 | The airship flew into a thunderstorm and encou... | 1913 |
| 4 | 1913-10-17 | Near Johannisthal, Germany | Military - German Navy | Zeppelin L-2 (airship) | 30.0 | 30.0 | 0.0 | Hydrogen gas which was being vented was sucked... | 1913 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5263 | 2009-05-20 | Near Madiun, Indonesia | Military - Indonesian Air Force | Lockheed C-130 Hercules | 112.0 | 98.0 | 2.0 | While on approach, the military transport cras... | 2009 |
| 5264 | 2009-05-26 | Near Isiro, DemocratiRepubliCongo | Service Air | Antonov An-26 | 4.0 | 4.0 | 0.0 | The cargo plane crashed while on approach to I... | 2009 |
| 5265 | 2009-06-01 | AtlantiOcean, 570 miles northeast of Natal, Br... | Air France | Airbus A330-203 | 228.0 | 228.0 | 0.0 | The Airbus went missing over the AtlantiOcean ... | 2009 |
| 5266 | 2009-06-07 | Near Port Hope Simpson, Newfoundland, Canada | Strait Air | Britten-Norman BN-2A-27 Islander | 1.0 | 1.0 | 0.0 | The air ambulance crashed into hills while att... | 2009 |
| 5267 | 2009-06-08 | State of Arunachal Pradesh, India | Military - Indian Air Force | Antonov An-32 | 13.0 | 13.0 | 0.0 | The military transport went missing while en r... | 2009 |
5246 rows × 9 columns
# b. By row: drop every row that lacks a Location, Operator or Type
required_columns = ['Location', 'Operator', 'Type']
has_missing_field = df[required_columns].isna().any(axis=1)
df.drop(index=df.index[has_missing_field], inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 5191 entries, 0 to 5267 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 5191 non-null datetime64[ns] 1 Location 5191 non-null object 2 Operator 5191 non-null object 3 Type 5191 non-null object 4 Aboard 5191 non-null float64 5 Fatalities 5191 non-null float64 6 Ground 5191 non-null float64 7 Summary 4823 non-null object 8 Year 5191 non-null int32 dtypes: datetime64[ns](1), float64(3), int32(1), object(4) memory usage: 385.3+ KB
# Renumber the rows 0..n-1 after the drops; drop=True discards the old labels
# instead of keeping them as an extra 'index' column.
df = df.reset_index(drop=True)
df
| Date | Location | Operator | Type | Aboard | Fatalities | Ground | Summary | Year | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1908-09-17 | Fort Myer, Virginia | Military - U.S. Army | Wright Flyer III | 2.0 | 1.0 | 0.0 | During a demonstration flight, a U.S. Army fly... | 1908 |
| 1 | 1912-07-12 | AtlantiCity, New Jersey | Military - U.S. Navy | Dirigible | 5.0 | 5.0 | 0.0 | First U.S. dirigible Akron exploded just offsh... | 1912 |
| 2 | 1913-08-06 | Victoria, British Columbia, Canada | Private | Curtiss seaplane | 1.0 | 1.0 | 0.0 | The first fatal airplane accident in Canada oc... | 1913 |
| 3 | 1913-09-09 | Over the North Sea | Military - German Navy | Zeppelin L-1 (airship) | 20.0 | 14.0 | 0.0 | The airship flew into a thunderstorm and encou... | 1913 |
| 4 | 1913-10-17 | Near Johannisthal, Germany | Military - German Navy | Zeppelin L-2 (airship) | 30.0 | 30.0 | 0.0 | Hydrogen gas which was being vented was sucked... | 1913 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5186 | 2009-05-20 | Near Madiun, Indonesia | Military - Indonesian Air Force | Lockheed C-130 Hercules | 112.0 | 98.0 | 2.0 | While on approach, the military transport cras... | 2009 |
| 5187 | 2009-05-26 | Near Isiro, DemocratiRepubliCongo | Service Air | Antonov An-26 | 4.0 | 4.0 | 0.0 | The cargo plane crashed while on approach to I... | 2009 |
| 5188 | 2009-06-01 | AtlantiOcean, 570 miles northeast of Natal, Br... | Air France | Airbus A330-203 | 228.0 | 228.0 | 0.0 | The Airbus went missing over the AtlantiOcean ... | 2009 |
| 5189 | 2009-06-07 | Near Port Hope Simpson, Newfoundland, Canada | Strait Air | Britten-Norman BN-2A-27 Islander | 1.0 | 1.0 | 0.0 | The air ambulance crashed into hills while att... | 2009 |
| 5190 | 2009-06-08 | State of Arunachal Pradesh, India | Military - Indian Air Force | Antonov An-32 | 13.0 | 13.0 | 0.0 | The military transport went missing while en r... | 2009 |
5191 rows × 9 columns
# Death per crash = the Fatalities column plus the Ground column.
df['Death'] = df['Fatalities'].add(df['Ground'])
df.head()
| Date | Location | Operator | Type | Aboard | Fatalities | Ground | Summary | Year | Death | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1908-09-17 | Fort Myer, Virginia | Military - U.S. Army | Wright Flyer III | 2.0 | 1.0 | 0.0 | During a demonstration flight, a U.S. Army fly... | 1908 | 1.0 |
| 1 | 1912-07-12 | AtlantiCity, New Jersey | Military - U.S. Navy | Dirigible | 5.0 | 5.0 | 0.0 | First U.S. dirigible Akron exploded just offsh... | 1912 | 5.0 |
| 2 | 1913-08-06 | Victoria, British Columbia, Canada | Private | Curtiss seaplane | 1.0 | 1.0 | 0.0 | The first fatal airplane accident in Canada oc... | 1913 | 1.0 |
| 3 | 1913-09-09 | Over the North Sea | Military - German Navy | Zeppelin L-1 (airship) | 20.0 | 14.0 | 0.0 | The airship flew into a thunderstorm and encou... | 1913 | 14.0 |
| 4 | 1913-10-17 | Near Johannisthal, Germany | Military - German Navy | Zeppelin L-2 (airship) | 30.0 | 30.0 | 0.0 | Hydrogen gas which was being vented was sucked... | 1913 | 30.0 |
# Head counts are whole numbers; cast the three count columns from float to int.
df = df.astype({'Aboard': int, 'Fatalities': int, 'Ground': int})
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5191 entries, 0 to 5190 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 5191 non-null datetime64[ns] 1 Location 5191 non-null object 2 Operator 5191 non-null object 3 Type 5191 non-null object 4 Aboard 5191 non-null int32 5 Fatalities 5191 non-null int32 6 Ground 5191 non-null int32 7 Summary 4823 non-null object 8 Year 5191 non-null int32 9 Death 5191 non-null float64 dtypes: datetime64[ns](1), float64(1), int32(4), object(4) memory usage: 324.6+ KB
# Analyse the number of crashes per year (one histogram bar per Year bin)
px.histogram(data_frame=df, x='Year')
# Analyse the number of deaths per year: one point per crash.
fig = px.scatter(df, x='Year', y='Death')
# Call out the outlier; the annotation text attributes it to the 9/11 events.
fig.add_annotation(
    x=2001,  # numeric coordinate, matching the integer Year axis
    y=2815,
    text="Anomali pada data, dikarenakan kejadian 9/11",
    showarrow=True,
    xanchor="right",
)
fig.show()
# Crash count per year with a LOWESS trendline.
# value_counts().reset_index() yields the columns 'Year' and 'count'.
crashes_per_year = df['Year'].value_counts().reset_index()
px.scatter(crashes_per_year, x='Year', y='count', trendline='lowess')
# The 50 operators with the highest number of recorded crashes.
top_operators = df['Operator'].value_counts().nlargest(50).reset_index()
px.scatter(
    top_operators,
    x='Operator',
    y='count',
    title='Top 50 Maskapai dengan jumlah kecelakaan tertinggi',
)
# The 50 aircraft types with the highest number of recorded crashes.
top_types = df['Type'].value_counts().nlargest(50).reset_index()
px.scatter(
    top_types,
    x='Type',
    y='count',
    title='Top 50 Tipe Pesawat dengan jumlah kecelakaan tertinggi',
)
# One point per distinct (Operator, Type) pair found in the data.
operator_type_pairs = df.value_counts(subset=['Operator', 'Type']).reset_index()
px.scatter(operator_type_pairs, x='Operator', y='Type', height=800)
from geopy.geocoders import Nominatim
# Shared Nominatim geocoder client used by get_lat_long below.
# NOTE(review): "my_geocoder" looks like a placeholder user agent; the service
# presumably expects an application-specific identifier — confirm before heavy use.
geolocator = Nominatim(user_agent="my_geocoder")
def get_lat_long(location):
    """Geocode a place description into a ``(latitude, longitude)`` pair.

    Parameters
    ----------
    location
        Query handed to ``geolocator.geocode`` (normally a place-name string).

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the geocoding result, or
        ``(None, None)`` when the input is missing/blank, nothing matched,
        or the lookup raised.
    """
    # Missing or blank values cannot be geocoded; skip the network round-trip.
    if location is None or (isinstance(location, float) and location != location):
        return None, None
    if isinstance(location, str) and not location.strip():
        return None, None

    try:
        geocode_result = geolocator.geocode(location)
    except Exception:
        # Deliberate best-effort: one failed lookup must not abort geocoding
        # of the remaining rows, so a failure is reported as "no coordinates".
        return None, None

    if not geocode_result:
        return None, None
    return geocode_result.latitude, geocode_result.longitude
# Geocode each distinct location once and broadcast the result to every row
# that shares it: repeated place names then cost a single lookup, and an
# empty frame no longer breaks on tuple unpacking.
coords_by_location = {
    place: get_lat_long(place) for place in df['Location'].dropna().unique()
}
# Series.map leaves NaN for any location without an entry in the mapping.
df['Latitude'] = df['Location'].map(
    {place: lat for place, (lat, _) in coords_by_location.items()}
)
df['Longitude'] = df['Location'].map(
    {place: lon for place, (_, lon) in coords_by_location.items()}
)
# World map of the geocoded crashes, coloured by the Death column.
hover_columns = ['Location', 'Year', 'Death', 'Summary']
px.scatter_geo(
    data_frame=df,
    lat='Latitude',
    lon='Longitude',
    color='Death',
    hover_data=hover_columns,
    title='Peta Kecelakaan Pesawat',
)